# -*- coding: utf-8 -*-
"""
Created on Fri Dec 04 19:57:57 2015

@author: liangrongli
"""
# Fetch a local URL

def remove_tag(str):
    """Strip page chrome (navigation, header, ads, footer, ...) from HTML.

    A fixed list of regular-expression substitutions is applied to the
    input, one after another, in the order listed below. Every pattern is
    compiled with ``re.S`` so that ``.`` also matches newlines and a match
    may span several lines.

    Args:
        str: HTML source text. The name shadows the builtin; it is kept
            unchanged so existing callers are not affected.

    Returns:
        The text after all substitutions, with leading and trailing
        whitespace stripped.
    """
    import re

    # (pattern, replacement) pairs; order matters because each rule runs on
    # the output of the previous one.
    rules = (
        # Navigation menu.
        (r'<nav .*?>.*?</nav>', ''),
        # Embedded iframes.
        (r'<iframe .*?>.*?</iframe>', ''),
        # Page header. The pattern must end at the closing </header> tag;
        # ending at a second opening <header> never matches a normal
        # <header>...</header> element.
        (r'<header>.*?</header>', ''),
        # Breadcrumb trail.
        (r'<div .*? id="breadcrumb"><div .*?>.*?</div></div>', ''),
        # Everything from the sidebar div through the end of the footer is
        # collapsed into a single closing tag.
        # NOTE(review): presumably the '</div>' keeps an enclosing wrapper
        # balanced -- confirm against the target page layout.
        (r'<div id="sidebar" .*?>(.*?)</div>.*?<footer>(.*?)</footer>',
         '</div>'),
        # Ad block up to and including the comment list.
        (r'<div class="post-adds">.*?<div .*? id="comment_list">.*?</div>',
         '</div></div>'),
        # "jiathis" share widget.
        (r'<div .*? class="jiathis_style">.*?</div>', ''),
    )

    html = str
    for pattern, replacement in rules:
        html = re.compile(pattern, re.S).sub(replacement, html)

    return html.strip()

import urllib2
import re
import time
# Time the whole fetch-and-extract run.
start = time.clock()

url = "file:///E:/python/t2.html"
user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:35.0) Gecko/20100101 Firefox/35.0"
headers = {"User-Agent": user_agent}

req = urllib2.Request(url, None, headers)
resp = urllib2.urlopen(req)
try:
    page_source = resp.read()
finally:
    # Release the underlying handle even if read() raises.
    resp.close()

# remove_tag(page_source) could be applied here to strip page chrome first.

# Capture from the article-comment div through the end of the first
# comment_list div; re.S lets '.' span newlines.
p1 = re.compile('<div id="article-comment" .*?>(.*?)<div id="comment_list" .*?>.*?</div>', re.S)
top_nav = p1.search(page_source)
if top_nav is None:
    # search() returns None when the page has no such section; report it
    # instead of failing with AttributeError on .group().
    print("no article-comment section found in %s" % url)
else:
    print(top_nav.group())

# To save the page straight to a local file instead:
#   urllib.urlretrieve(url, '43922.html')

end = time.clock()
print("run time is %f s" % (end - start))